NOTE Before starting this activity please remember to clear your environment.
rm(list = ls(all=TRUE))
Read the dataset
Data pre-processing
Explore the dataset
# Use the setwd() function to get to the directory where the data is present
cereals_data <- read.csv('Cereals.csv', header = T)
str(cereals_data)
## 'data.frame': 77 obs. of 14 variables:
## $ name : Factor w/ 77 levels "100%_Bran","100%_Natural_Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ calories: int 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : int 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
summary(cereals_data)
## name calories protein
## 100%_Bran : 1 Min. : 50.0 Min. :1.000
## 100%_Natural_Bran : 1 1st Qu.:100.0 1st Qu.:2.000
## All-Bran : 1 Median :110.0 Median :3.000
## All-Bran_with_Extra_Fiber: 1 Mean :106.9 Mean :2.545
## Almond_Delight : 1 3rd Qu.:110.0 3rd Qu.:3.000
## Apple_Cinnamon_Cheerios : 1 Max. :160.0 Max. :6.000
## (Other) :71
## fat sodium fiber carbo
## Min. :0.000 Min. : 0.0 Min. : 0.000 Min. : 5.0
## 1st Qu.:0.000 1st Qu.:130.0 1st Qu.: 1.000 1st Qu.:12.0
## Median :1.000 Median :180.0 Median : 2.000 Median :14.5
## Mean :1.013 Mean :159.7 Mean : 2.152 Mean :14.8
## 3rd Qu.:2.000 3rd Qu.:210.0 3rd Qu.: 3.000 3rd Qu.:17.0
## Max. :5.000 Max. :320.0 Max. :14.000 Max. :23.0
## NA's :1
## sugars potass vitamins shelf
## Min. : 0.000 Min. : 15.00 Min. : 0.00 Min. :1.000
## 1st Qu.: 3.000 1st Qu.: 42.50 1st Qu.: 25.00 1st Qu.:1.000
## Median : 7.000 Median : 90.00 Median : 25.00 Median :2.000
## Mean : 7.026 Mean : 98.67 Mean : 28.25 Mean :2.208
## 3rd Qu.:11.000 3rd Qu.:120.00 3rd Qu.: 25.00 3rd Qu.:3.000
## Max. :15.000 Max. :330.00 Max. :100.00 Max. :3.000
## NA's :1 NA's :2
## weight cups rating
## Min. :0.50 Min. :0.250 Min. :18.04
## 1st Qu.:1.00 1st Qu.:0.670 1st Qu.:33.17
## Median :1.00 Median :0.750 Median :40.40
## Mean :1.03 Mean :0.821 Mean :42.67
## 3rd Qu.:1.00 3rd Qu.:1.000 3rd Qu.:50.83
## Max. :1.50 Max. :1.500 Max. :93.70
##
The dataset has 77 observations of 14 variables
The column/variable names’ are self explanatory
#See the head and tail of the dataframe
head(cereals_data)
## name calories protein fat sodium fiber carbo sugars
## 1 100%_Bran 70 4 1 130 10.0 5.0 6
## 2 100%_Natural_Bran 120 3 5 15 2.0 8.0 8
## 3 All-Bran 70 4 1 260 9.0 7.0 5
## 4 All-Bran_with_Extra_Fiber 50 4 0 140 14.0 8.0 0
## 5 Almond_Delight 110 2 2 200 1.0 14.0 8
## 6 Apple_Cinnamon_Cheerios 110 2 2 180 1.5 10.5 10
## potass vitamins shelf weight cups rating
## 1 280 25 3 1 0.33 68.40297
## 2 135 0 3 1 1.00 33.98368
## 3 320 25 3 1 0.33 59.42551
## 4 330 25 3 1 0.50 93.70491
## 5 NA 25 3 1 0.75 34.38484
## 6 70 25 1 1 0.75 29.50954
tail(cereals_data)
## name calories protein fat sodium fiber carbo sugars
## 72 Total_Whole_Grain 100 3 1 200 3 16 3
## 73 Triples 110 2 1 250 0 21 3
## 74 Trix 110 1 1 140 0 13 12
## 75 Wheat_Chex 100 3 1 230 3 17 3
## 76 Wheaties 100 3 1 200 3 17 3
## 77 Wheaties_Honey_Gold 110 2 1 200 1 16 8
## potass vitamins shelf weight cups rating
## 72 110 100 3 1 1.00 46.65884
## 73 60 25 3 1 0.75 39.10617
## 74 25 25 2 1 1.00 27.75330
## 75 115 25 1 1 0.67 49.78744
## 76 110 25 1 1 1.00 51.59219
## 77 60 25 1 1 0.75 36.18756
#Store all column names in variable called 'attr'
attr <- colnames(cereals_data)
str(attr)
## chr [1:14] "name" "calories" "protein" "fat" "sodium" "fiber" "carbo" ...
#Store all categorical attributes in 'cat_Attr'
cat_attr <- "shelf"
#Now, how to find the numerical attributes?
num_attr <- setdiff(attr, c(cat_attr, "name"))
num_attr
## [1] "calories" "protein" "fat" "sodium" "fiber" "carbo"
## [7] "sugars" "potass" "vitamins" "weight" "cups" "rating"
cereals_data$shelf <- as.factor(as.character(cereals_data$shelf))
#Now see the structure of the dataframe
str(cereals_data)
## 'data.frame': 77 obs. of 14 variables:
## $ name : Factor w/ 77 levels "100%_Bran","100%_Natural_Bran",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ calories: int 70 120 70 50 110 110 110 130 90 90 ...
## $ protein : int 4 3 4 4 2 2 2 3 2 3 ...
## $ fat : int 1 5 1 0 2 2 0 2 1 0 ...
## $ sodium : int 130 15 260 140 200 180 125 210 200 210 ...
## $ fiber : num 10 2 9 14 1 1.5 1 2 4 5 ...
## $ carbo : num 5 8 7 8 14 10.5 11 18 15 13 ...
## $ sugars : int 6 8 5 0 8 10 14 8 6 5 ...
## $ potass : int 280 135 320 330 NA 70 30 100 125 190 ...
## $ vitamins: int 25 0 25 25 25 25 25 25 25 25 ...
## $ shelf : Factor w/ 3 levels "1","2","3": 3 3 3 3 3 1 2 3 1 3 ...
## $ weight : num 1 1 1 1 1 1 1 1.33 1 1 ...
## $ cups : num 0.33 1 0.33 0.5 0.75 0.75 1 0.75 0.67 0.67 ...
## $ rating : num 68.4 34 59.4 93.7 34.4 ...
rownames(cereals_data) <- cereals_data$name
#cereals_data <- cereals_data[, -c(colnames(cereals_data) %in% ("name"))]
# (or)
cereals_data$name <- NULL
sum(is.na(cereals_data))
## [1] 4
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
sum(is.na(cereals_data$shelf))
## [1] 0
cereals_data[,num_attr] <- knnImputation(cereals_data[,num_attr], k = 3, scale = T)
sum(is.na(cereals_data))
## [1] 0
head(cereals_data)
## calories protein fat sodium fiber carbo sugars
## 100%_Bran 70 4 1 130 10.0 5.0 6
## 100%_Natural_Bran 120 3 5 15 2.0 8.0 8
## All-Bran 70 4 1 260 9.0 7.0 5
## All-Bran_with_Extra_Fiber 50 4 0 140 14.0 8.0 0
## Almond_Delight 110 2 2 200 1.0 14.0 8
## Apple_Cinnamon_Cheerios 110 2 2 180 1.5 10.5 10
## potass vitamins shelf weight cups rating
## 100%_Bran 280.00000 25 3 1 0.33 68.40297
## 100%_Natural_Bran 135.00000 0 3 1 1.00 33.98368
## All-Bran 320.00000 25 3 1 0.33 59.42551
## All-Bran_with_Extra_Fiber 330.00000 25 3 1 0.50 93.70491
## Almond_Delight 57.84562 25 3 1 0.75 34.38484
## Apple_Cinnamon_Cheerios 70.00000 25 1 1 0.75 29.50954
#How do you find missing values per column?
colSums(is.na(cereals_data))
## calories protein fat sodium fiber carbo sugars potass
## 0 0 0 0 0 0 0 0
## vitamins shelf weight cups rating
## 0 0 0 0 0
#Make a copy of the dataframe for later use (mixed attributes)
cereals_data_copy <- cereals_data
library(dummies)
## dummies-1.5.6 provided by Decision Patterns
shelfDummies <- data.frame(dummy(cereals_data$shelf))
#Name the new attributes appropriately
names(shelfDummies) <- c("Shelf1","Shelf2","Shelf3")
head(shelfDummies)
## Shelf1 Shelf2 Shelf3
## 1 0 0 1
## 2 0 0 1
## 3 0 0 1
## 4 0 0 1
## 5 0 0 1
## 6 1 0 0
cereals_data$shelf <- NULL
cereals_data <- data.frame(cbind(cereals_data, shelfDummies))
#check the dataframe using head()
head(cereals_data)
## calories protein fat sodium fiber carbo sugars
## 100%_Bran 70 4 1 130 10.0 5.0 6
## 100%_Natural_Bran 120 3 5 15 2.0 8.0 8
## All-Bran 70 4 1 260 9.0 7.0 5
## All-Bran_with_Extra_Fiber 50 4 0 140 14.0 8.0 0
## Almond_Delight 110 2 2 200 1.0 14.0 8
## Apple_Cinnamon_Cheerios 110 2 2 180 1.5 10.5 10
## potass vitamins weight cups rating Shelf1
## 100%_Bran 280.00000 25 1 0.33 68.40297 0
## 100%_Natural_Bran 135.00000 0 1 1.00 33.98368 0
## All-Bran 320.00000 25 1 0.33 59.42551 0
## All-Bran_with_Extra_Fiber 330.00000 25 1 0.50 93.70491 0
## Almond_Delight 57.84562 25 1 0.75 34.38484 0
## Apple_Cinnamon_Cheerios 70.00000 25 1 0.75 29.50954 1
## Shelf2 Shelf3
## 100%_Bran 0 1
## 100%_Natural_Bran 0 1
## All-Bran 0 1
## All-Bran_with_Extra_Fiber 0 1
## Almond_Delight 0 1
## Apple_Cinnamon_Cheerios 0 0
cereals_data[, num_attr] <- scale(cereals_data[,num_attr], center = T, scale = T)
#install.packages("factoextra")
# if(!require(devtools)) install.packages("devtools")
# devtools::install_github("kassambara/factoextra")
library(factoextra)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.4
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
# Use the get_dist() function from the factoexrtra to calculate inter-observation distances
distance <- get_dist(cereals_data)
# The fviz_dist() function plots a visual representation of the inter-observation distances
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# The gradient argument, helps us define the color range for the distance scale
# We use the euclidean distance measure (all attributes are numerical now)
distM <- dist(cereals_data, method = "euclidean")
#distG <- dist(cereals_data, method = "gower")
hc_fit <- hclust(distM, method = "ward.D2")
#ward.D2 method - find the pair of clusters that leads to minimum increase in total within-cluster variance after merging
plot(hc_fit)
#Plot clusters being surrounded by a border, using the rect.hclust() function
rect.hclust(hc_fit, k = 6, border = "red")
points_hc <- cutree(hc_fit, k = 6)
# Store the clusters in a data frame along with the cereals data
cereals_clusts_hc <- cbind(points_hc, cereals_data)
# Have a look at the head of the new data frame
colnames(cereals_clusts_hc)[1] <- "cluster_hc"
head(cereals_clusts_hc)
## cluster_hc calories protein fat
## 100%_Bran 1 -1.8929836 1.3286071 -0.01290349
## 100%_Natural_Bran 2 0.6732089 0.4151897 3.96137277
## All-Bran 1 -1.8929836 1.3286071 -0.01290349
## All-Bran_with_Extra_Fiber 1 -2.9194605 1.3286071 -1.00647256
## Almond_Delight 3 0.1599704 -0.4982277 0.98066557
## Apple_Cinnamon_Cheerios 3 0.1599704 -0.4982277 0.98066557
## sodium fiber carbo sugars
## 100%_Bran -0.3539844 3.29284661 -2.5214018 -0.2298253
## 100%_Natural_Bran -1.7257708 -0.06375361 -1.7487994 0.2293718
## All-Bran 1.1967306 2.87327158 -2.0063336 -0.4594238
## All-Bran_with_Extra_Fiber -0.2346986 4.97114672 -1.7487994 -1.6074165
## Almond_Delight 0.4810160 -0.48332864 -0.2035948 0.2293718
## Apple_Cinnamon_Cheerios 0.2424445 -0.27354112 -1.1049642 0.6885689
## potass vitamins weight cups
## 100%_Bran 2.6078060 -0.1453172 -0.1967771 -2.1100340
## 100%_Natural_Bran 0.5262315 -1.2642598 -0.1967771 0.7690100
## All-Bran 3.1820334 -0.1453172 -0.1967771 -2.1100340
## All-Bran_with_Extra_Fiber 3.3255903 -0.1453172 -0.1967771 -1.3795303
## Almond_Delight -0.5813726 -0.1453172 -0.1967771 -0.3052601
## Apple_Cinnamon_Cheerios -0.4068881 -0.1453172 -0.1967771 -0.3052601
## rating Shelf1 Shelf2 Shelf3
## 100%_Bran 1.8321876 0 0 1
## 100%_Natural_Bran -0.6180571 0 0 1
## All-Bran 1.1930986 0 0 1
## All-Bran_with_Extra_Fiber 3.6333849 0 0 1
## Almond_Delight -0.5894990 0 0 1
## Apple_Cinnamon_Cheerios -0.9365625 1 0 0
#Scaling the numeric attributes
cereals_data_copy[,num_attr] <- scale(cereals_data_copy[,num_attr],scale=T,center=T)
#Calculating gower distance
library(cluster)
gower_dist = daisy(cereals_data_copy,metric = "gower")
head(gower_dist)
## [1] 0.33742882 0.06931482 0.14573742 0.30380269 0.37045842 0.40646494
class(gower_dist)
## [1] "dissimilarity" "dist"
#Now that you have the distance matrix, do the hclust()
hc_fit_mixed <- hclust(gower_dist, method = "ward.D2")
help("kmeans")
plot(hc_fit_mixed )
points_hc_mixed <- cutree(hc_fit_mixed , k = 6)
# Store the clusters in a data frame along with the cereals data
cereals_clusts_hc_mixed <- cbind(points_hc_mixed, cereals_data)
# Have a look at the head of the new data frame
colnames(cereals_clusts_hc_mixed)[1] <- "cluster_hc_mixed"
head(cereals_clusts_hc_mixed)
## cluster_hc_mixed calories protein
## 100%_Bran 1 -1.8929836 1.3286071
## 100%_Natural_Bran 2 0.6732089 0.4151897
## All-Bran 1 -1.8929836 1.3286071
## All-Bran_with_Extra_Fiber 1 -2.9194605 1.3286071
## Almond_Delight 2 0.1599704 -0.4982277
## Apple_Cinnamon_Cheerios 3 0.1599704 -0.4982277
## fat sodium fiber carbo
## 100%_Bran -0.01290349 -0.3539844 3.29284661 -2.5214018
## 100%_Natural_Bran 3.96137277 -1.7257708 -0.06375361 -1.7487994
## All-Bran -0.01290349 1.1967306 2.87327158 -2.0063336
## All-Bran_with_Extra_Fiber -1.00647256 -0.2346986 4.97114672 -1.7487994
## Almond_Delight 0.98066557 0.4810160 -0.48332864 -0.2035948
## Apple_Cinnamon_Cheerios 0.98066557 0.2424445 -0.27354112 -1.1049642
## sugars potass vitamins weight
## 100%_Bran -0.2298253 2.6078060 -0.1453172 -0.1967771
## 100%_Natural_Bran 0.2293718 0.5262315 -1.2642598 -0.1967771
## All-Bran -0.4594238 3.1820334 -0.1453172 -0.1967771
## All-Bran_with_Extra_Fiber -1.6074165 3.3255903 -0.1453172 -0.1967771
## Almond_Delight 0.2293718 -0.5813726 -0.1453172 -0.1967771
## Apple_Cinnamon_Cheerios 0.6885689 -0.4068881 -0.1453172 -0.1967771
## cups rating Shelf1 Shelf2 Shelf3
## 100%_Bran -2.1100340 1.8321876 0 0 1
## 100%_Natural_Bran 0.7690100 -0.6180571 0 0 1
## All-Bran -2.1100340 1.1930986 0 0 1
## All-Bran_with_Extra_Fiber -1.3795303 3.6333849 0 0 1
## Almond_Delight -0.3052601 -0.5894990 0 0 1
## Apple_Cinnamon_Cheerios -0.3052601 -0.9365625 1 0 0
plot(hc_fit_mixed)
rect.hclust(hc_fit_mixed, k = 5, border = "red")
set.seed(144423)
km_basic <- kmeans(cereals_data, centers = 2, nstart = 20)
fviz_cluster(km_basic, cereals_data)
The kmeans() function returns a list of 9 objects which include the cluster assignments (“cluster”), cluster centers (“centers”), etc. You can further explore the returned object by calling the str() function on the returned object and going through the documentation
Let’s now build a screen plot to choose a “k”
# Initialize wss to 0
wss <- 0
set.seed(2344)
# From 1 upto upto 10 cluster centers, fit the kmeans model
for (i in 1:10) {
cfit <- kmeans(cereals_data, centers = i, nstart = 20)
# Store the sum of within sum of square
wss[i] <- sum(cfit$withinss)
}
plot(1:10, wss, type = "b")
fviz_nbclust(cereals_data, kmeans, method = "wss")
set.seed(12344)
km_clust <- kmeans(cereals_data, 6)
# after choosing k as 6, let's store the cluster groupings along with the data in a new data frame
km_points <- km_clust$cluster
# Store the cluster assignments in a new data frame
cereals_clusts_km <- as.data.frame(cbind(km_clust$cluster, cereals_data))
# Look at the head of the data
head(cereals_clusts_km)
## km_clust$cluster calories protein
## 100%_Bran 4 -1.8929836 1.3286071
## 100%_Natural_Bran 5 0.6732089 0.4151897
## All-Bran 4 -1.8929836 1.3286071
## All-Bran_with_Extra_Fiber 4 -2.9194605 1.3286071
## Almond_Delight 1 0.1599704 -0.4982277
## Apple_Cinnamon_Cheerios 1 0.1599704 -0.4982277
## fat sodium fiber carbo
## 100%_Bran -0.01290349 -0.3539844 3.29284661 -2.5214018
## 100%_Natural_Bran 3.96137277 -1.7257708 -0.06375361 -1.7487994
## All-Bran -0.01290349 1.1967306 2.87327158 -2.0063336
## All-Bran_with_Extra_Fiber -1.00647256 -0.2346986 4.97114672 -1.7487994
## Almond_Delight 0.98066557 0.4810160 -0.48332864 -0.2035948
## Apple_Cinnamon_Cheerios 0.98066557 0.2424445 -0.27354112 -1.1049642
## sugars potass vitamins weight
## 100%_Bran -0.2298253 2.6078060 -0.1453172 -0.1967771
## 100%_Natural_Bran 0.2293718 0.5262315 -1.2642598 -0.1967771
## All-Bran -0.4594238 3.1820334 -0.1453172 -0.1967771
## All-Bran_with_Extra_Fiber -1.6074165 3.3255903 -0.1453172 -0.1967771
## Almond_Delight 0.2293718 -0.5813726 -0.1453172 -0.1967771
## Apple_Cinnamon_Cheerios 0.6885689 -0.4068881 -0.1453172 -0.1967771
## cups rating Shelf1 Shelf2 Shelf3
## 100%_Bran -2.1100340 1.8321876 0 0 1
## 100%_Natural_Bran 0.7690100 -0.6180571 0 0 1
## All-Bran -2.1100340 1.1930986 0 0 1
## All-Bran_with_Extra_Fiber -1.3795303 3.6333849 0 0 1
## Almond_Delight -0.3052601 -0.5894990 0 0 1
## Apple_Cinnamon_Cheerios -0.3052601 -0.9365625 1 0 0
colnames(cereals_clusts_km)[1] <- "cluster_km"
fviz_cluster(km_clust, cereals_data)